The goal of this project is to investigate what causes Serious and Fatal accidents, in the hope of preventing them and decreasing their number. The dataset consists of accident records from the UK over the course of 15+ years. I hope to show the causes of these accidents through visualizations and to create an algorithm that can predict the severity of accidents.
The UK government collects and publishes (usually on an annual basis) detailed information about traffic accidents across the country. This information includes, but is not limited to, geographical locations, weather conditions, type of vehicles, number of casualties and vehicle manoeuvres, making this a very interesting and comprehensive dataset for analysis and research.
The data that I'm using is compiled and available through Kaggle and, in a less compiled form, here.
Problem: Severe and fatal accidents. Solution: Use data to figure out how to lower the number of accidents and the severity of them.
Questions:
#Import modules
import numpy as np
import holidays
import pandas as pd
import seaborn as sns
import pickle
import time
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import datetime
import math
#scipy
import scipy
from scipy import stats
from scipy.stats import ttest_ind
#sklearn
import sklearn
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
#other learners
from xgboost import XGBClassifier
import lightgbm as lgb
#time series stuff
import statsmodels.api as sm
from pylab import rcParams
import itertools
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
#warning ignorer
import warnings
warnings.filterwarnings("ignore")
# Open both CSV files as chunked readers (30,000 rows per chunk) so that each
# one can be filtered piece by piece further down.
csv_reader_options = {'low_memory': False, 'chunksize': 30000}
ac = pd.read_csv(r'Accident_Information.csv', **csv_reader_options)
vc = pd.read_csv(r'Vehicle_Information.csv', **csv_reader_options)
Previously, I did not remove "Data missing or out of range" from the datasets; however, after cleaning and checking the value counts, I decided to do so for sanity purposes only. In most columns this value accounted for only a small percentage of the rows.
# Filter the accident file one chunk at a time and concatenate what is kept.
# A row survives when its year is 2010-2017, its road type is not "Unknown",
# none of the listed columns holds the "Data missing or out of range"
# placeholder, and both coordinates are present.
acc_placeholder_cols = [
    'Junction_Control',
    'Carriageway_Hazards',
    'Junction_Detail',
    'Road_Surface_Conditions',
    'Special_Conditions_at_Site',
    'Weather_Conditions',
]
acchunk = []
for acc_piece in ac:
    acc_year = acc_piece.Year.astype(int)
    acc_keep = acc_year.between(2010, 2017) & (acc_piece['Road_Type'] != "Unknown")
    acc_keep &= acc_piece[acc_placeholder_cols].ne("Data missing or out of range").all(axis=1)
    acc_keep &= acc_piece[['Latitude', 'Longitude']].notnull().all(axis=1)
    acchunk.append(acc_piece[acc_keep])
df1 = pd.concat(acchunk)
# Filter the vehicle file one chunk at a time and concatenate what is kept.
# A row survives when its year is 2010-2017 and none of the listed columns
# holds the "Data missing or out of range" placeholder.
veh_placeholder_cols = [
    'Driver_Home_Area_Type',
    'Journey_Purpose_of_Driver',
    'Junction_Location',
    'Was_Vehicle_Left_Hand_Drive',
    'Hit_Object_in_Carriageway',
    'Skidding_and_Overturning',
    'Towing_and_Articulation',
    'Vehicle_Leaving_Carriageway',
    'Vehicle_Manoeuvre',
    'Vehicle_Type',
    'X1st_Point_of_Impact',
    'Sex_of_Driver',
    'Age_Band_of_Driver',
]
vcchunk = []
for veh_piece in vc:
    veh_year = veh_piece.Year.astype(int)
    veh_keep = veh_year.between(2010, 2017)
    veh_keep &= veh_piece[veh_placeholder_cols].ne("Data missing or out of range").all(axis=1)
    vcchunk.append(veh_piece[veh_keep])
df2 = pd.concat(vcchunk)
# Show the columns and shape of each cleaned table before joining them.
print("Accident's Columns:\n", df1.columns, "\n")
print("Vehicle's Columns:\n", df2.columns)
for shape_label, shape_frame in (('Accident Shape', df1), ('Vehicle Shape', df2)):
    print(shape_label, shape_frame.shape)
# Join the tables. No keys are given, so pandas joins on the columns the two
# frames have in common.
df = df1.merge(df2)
# Show the columns and shape of the combined table.
print("Names of Combined Columns:\n", df.columns, "\n")
print("\nShape:\n", df.shape)
df.describe(include='all')
# Print how strongly each OSGR grid coordinate correlates with its
# longitude/latitude counterpart.
for grid_col, geo_col in (('Location_Easting_OSGR', 'Longitude'),
                          ('Location_Northing_OSGR', 'Latitude')):
    print(df[grid_col].corr(df[geo_col]))
# The grid coordinates duplicate latitude/longitude, so remove them.
df = df.drop(columns=['Location_Easting_OSGR', 'Location_Northing_OSGR'])
df.shape
# Standardise column names for easier querying: lowercase everything and
# strip the '.', '(' and ')' characters.
# regex=False is spelled out because all three characters are regex
# metacharacters and the default of `regex` differs between pandas versions;
# a literal match is what is wanted here.
df.columns = df.columns.str.lower()
for unwanted_char in ('.', '(', ')'):
    df.columns = df.columns.str.replace(unwanted_char, '', regex=False)
#convert date/time to datetime datatype
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
#df.dtypes
# Mistyped columns: store them as generic objects instead of numbers.
# Each column is listed exactly once (the earlier list named
# 'driver_imd_decile' twice, producing a duplicated assignment key).
object_cols = [
    'did_police_officer_attend_scene_of_accident',
    'driver_imd_decile',
    'vehicle_reference',
    'vehicle_locationrestricted_lane',
    '1st_road_number',
    '2nd_road_number',
    'pedestrian_crossing-physical_facilities',
    'pedestrian_crossing-human_control',
]
df[object_cols] = df[object_cols].astype('object')
# Column names grouped by dtype.
df.columns.to_series().groupby(df.dtypes).groups
# Percentage of nulls per column, largest first.
df.isnull().sum().sort_values(ascending=False)/df.shape[0]*100
# Share of all rows (in percent) taken by each 2nd_road_class value.
df['2nd_road_class'].value_counts().div(len(df)).mul(100)
With 40% of the non-null values being unclassified and 39% of the overall 2nd_road_class column being null, I have decided to drop it in its entirety.
df = df.drop(columns=['2nd_road_class'])
# driver_imd_decile: share of all rows (in percent) per value, plus a histogram.
imd_decile = df['driver_imd_decile']
imd_decile.value_counts().div(len(df)).mul(100)
imd_decile.hist()
plt.style.use('dark_background')
Since the distribution of categories for 'driver_imd_decile' seems very similar, I've decided not to use the mode but "method='ffill'".
# Forward-fill missing deciles. The result is assigned back to the column
# because an in-place fill on `df[...]` acts on an intermediate selection and
# is not guaranteed to reach `df`; `.ffill()` also replaces the deprecated
# `fillna(method='ffill')` spelling.
df['driver_imd_decile'] = df['driver_imd_decile'].ffill()
df['age_of_vehicle'].describe()
df['age_of_vehicle'].median()
Changing the nulls of "age of vehicle" to median, then creating it as a category
#fillna by 7
# Assigned back rather than filled in place: an in-place fill on `df[...]`
# acts on an intermediate selection and is not guaranteed to reach `df`.
df['age_of_vehicle'] = df['age_of_vehicle'].fillna(7)
#group age_of_vehicle
#bands used by the pd.cut call below (upper bound inclusive):
#1=up to 2, 2=2-5, 3=5-8, 4=8-11, 5=11-14, 6=14-17, 7=17-120
def fixedvehicleage(age):
    """Return *age* unchanged when it lies in 0..120 inclusive, else NaN."""
    return age if 0 <= age <= 120 else np.nan
# Blank out ages outside 0..120, then bucket the rest into seven labelled bands.
df['age_of_vehicle'] = df['age_of_vehicle'].apply(fixedvehicleage)
# include_lowest=True keeps an age of exactly 0 in band '1'. fixedvehicleage
# accepts 0 as valid, but the default left-open first bin (0, 2] would turn
# it into NaN.
df['age_of_vehicle'] = pd.cut(df['age_of_vehicle'],
                              [0, 2, 5, 8, 11, 14, 17, 120],
                              labels=['1', '2', '3', '4', '5', '6', '7'],
                              include_lowest=True)
# model: share of all rows (in percent) per value, then summary statistics.
vehicle_model = df['model']
vehicle_model.value_counts().div(len(df)).mul(100)
vehicle_model.describe()
Knowing that there are 28824 unique models for the model column I have decided to use the ffill method on it as well.
# Forward-fill missing models and assign the result back: an in-place fill on
# `df[...]` acts on an intermediate selection and is not guaranteed to reach
# `df`, and `fillna(method='ffill')` is a deprecated spelling of `.ffill()`.
df['model'] = df['model'].ffill()
Note: A lot of the values of "model" are labeled as "missing". I do not want to change these because the model could have actually been missing from the car involved in the accident, or it could have been unrecognizable at the time of the accident.
# engine_capacity_cc: summary statistics and a histogram of the raw values.
engine_cc_raw = df['engine_capacity_cc']
engine_cc_raw.describe()
engine_cc_raw.hist()
plt.style.use('dark_background')
I am going to handle both the outliers and the null values of engine_capacity_cc using the ideas of quantiles and the interquartile range (IQR).
# Outlier cut-offs for engine capacity: 1.5 interquartile ranges below the
# first quartile (ecmin) and above the third quartile (ecmax), computed on the
# non-null values only.
observed_cc = df['engine_capacity_cc'].dropna()
q25, q75 = np.percentile(observed_cc, [25, 75])
iqr = q75 - q25
fence_width = iqr*1.5
ecmin = q25 - fence_width
ecmax = q75 + fence_width
print(ecmax)
print(ecmin)
To explain, what I am going to do is use ecmax as the maximum engine_capacity_cc and ecmin as the minimum engine_capacity_cc. Then I'm going to take the mean of the remaining values and use it as my fillna.
# Keep rows whose engine capacity lies inside [ecmin, ecmax]. Rows with a
# missing capacity are kept as well: a plain <= / >= comparison is False for
# NaN and would silently drop them, leaving nothing for the later fillna on
# this column to fill.
engine_cc = df['engine_capacity_cc']
df = df[engine_cc.isna() | engine_cc.between(ecmin, ecmax)]
df['engine_capacity_cc'].hist(bins=20)
plt.style.use('dark_background')
I can accept this distribution and will now check and handle their nulls
# Summary statistics and mean of the engine capacities that remain.
remaining_cc = df['engine_capacity_cc']
remaining_cc.describe()
remaining_cc.mean()
Going to round this mean value
# Fill missing engine capacities with the rounded mean. The result is assigned
# back because an in-place fill on `df[...]` acts on an intermediate selection
# and is not guaranteed to reach `df`.
df['engine_capacity_cc'] = df['engine_capacity_cc'].fillna(1652)
Note: After doing the above null fixes, propulsion_code dropped from having 10% null values to 0 (see below). I will continue on and fix lsoa_of_accident_location, then drop the rest of the null values, which are all <5%.
# Percentage of nulls per column, largest first.
df.isnull().sum().sort_values(ascending=False).div(len(df)).mul(100)
# lsoa_of_accident_location: value frequencies and summary statistics.
lsoa = df['lsoa_of_accident_location']
lsoa.value_counts()
lsoa.describe()
With 35061 unique values and high counts among the top values, I have decided to use ffill again.
# Forward-fill missing LSOA codes and assign the result back: an in-place
# fill on `df[...]` acts on an intermediate selection and is not guaranteed
# to reach `df`, and `fillna(method='ffill')` is a deprecated spelling.
df['lsoa_of_accident_location'] = df['lsoa_of_accident_location'].ffill()
#### Check nulls again
df.isnull().sum().sort_values(ascending=False)/df.shape[0]*100
Dropping the remaining nulls that are <1%.
# Discard every row that still contains a null.
df = df.dropna()
# Final check: percentage of nulls per column, then overall shape and dtypes.
df.isnull().sum().sort_values(ascending=False).div(len(df)).mul(100)
df.shape
df.info()
# Histograms of the numeric columns (lat/long and year left out) to look for
# outliers.
numeric_cols = ['engine_capacity_cc', 'number_of_casualties', 'number_of_vehicles', 'speed_limit']
df_num = df[numeric_cols]
df_num.hist(bins=25, grid=False, figsize=(12, 8))
plt.style.use('dark_background')
Column 'speed_limit' seems ok, and 'engine_capacity_cc' was altered previously. However, 'number_of_casualties' and 'number_of_vehicles' will be evaluated.
# Frequency of each raw number_of_casualties value.
raw_casualties = df['number_of_casualties']
raw_casualties.value_counts()
def casualities(num_cas):
    """Map a casualty count to its label: "1".."4", or "5+" from five upward.

    Returns None when the count is below 1 or is not comparable (e.g. NaN).
    """
    if num_cas >= 5:
        return "5+"
    for lower in (1, 2, 3, 4):
        if lower <= num_cas < lower + 1:
            return str(lower)
    return None
# Replace the raw counts with their group labels.
df['number_of_casualties'] = df['number_of_casualties'].map(casualities)
# Frequency of each casualty group.
df['number_of_casualties'].value_counts()
# Share of all rows (in percent) per propulsion_code value.
df['propulsion_code'].value_counts().div(len(df)).mul(100)
# Clean the propulsion_code labels with a single lookup; none of the new
# labels is itself a key, so one pass gives the same result as replacing one
# label at a time.
propulsion_relabel = {
    "Gas": "Petrol",
    "Gas/Bi-fuel": "Bio-fuel",
    "Petrol/Gas (LPG)": "LPG Petrol",
    "Gas Diesel": "Diesel",
}
df['propulsion_code'] = df['propulsion_code'].replace(propulsion_relabel)
df['propulsion_code'].value_counts().div(len(df)).mul(100)
# Number of distinct values per column, largest first.
df.nunique().sort_values(ascending=False)
df['date'] = pd.to_datetime(df['date'])
# Calendar month of each accident.
df['month'] = df['date'].apply(lambda stamp: stamp.month)
# Weekend flag: 1 for Friday, Saturday and Sunday, 0 otherwise.
weekend_days = ['Friday', 'Saturday', 'Sunday']
is_weekend = df['day_of_week'].isin(weekend_days)
df['weekend'] = np.where(is_weekend, 1, 0)
# Readable names for the time-of-day codes assigned from the hour.
timeofdaygroups = {
    1: "Morning Rush (6-10)",
    2: "Day (10-12)",
    3: "Lunch Rush (12-14)",
    4: "Afternoon (14-16)",
    5: "After Work Rush (16-18)",
    6: "Evening (18-22)",
    7: "Night (22-6)",
}
# Hour of the accident: the first two characters of the time string, parsed
# as a number and stored as an integer.
hour_text = df['time'].str.slice(0, 2)
df['hour'] = pd.to_numeric(hour_text).astype('int')
def daygroup(hour):
    """Return the time-of-day code ("1".."7") for an hour of the day.

    Each band includes its start hour and excludes its end hour; any hour
    outside 6..21 gets "7".
    """
    bands = (
        (6, 10, "1"),
        (10, 12, "2"),
        (12, 14, "3"),
        (14, 16, "4"),
        (16, 18, "5"),
        (18, 22, "6"),
    )
    for start, stop, code in bands:
        if start <= hour < stop:
            return code
    return "7"
# Assign each row its time-of-day code.
df['time_of_day'] = df['hour'].map(daygroup)
preview_cols = ['weekend', 'day_of_week', 'time', 'time_of_day']
df[preview_cols].tail(10)
# Share of all rows (in percent) per vehicle_type value.
df['vehicle_type'].value_counts().div(len(df)).mul(100)
I want to condense the vehicle type variables.
# Condense the detailed vehicle types into broader groups with one lookup.
# None of the group names is itself a key, so a single pass gives the same
# result as replacing one label at a time.
vehicle_groups = {
    "Motorcycle": [
        "Motorcycle over 500cc",
        "Motorcycle over 125cc and up to 500cc",
        "Motorcycle 125cc and under",
        "Motorcycle 50cc and under",
        "Electric motorcycle",
        "Motorcycle - unknown cc",
    ],
    "Goods Vehicle": [
        "Van / Goods 3.5 tonnes mgw or under",
        "Goods over 3.5t. and under 7.5t",
        "Goods vehicle - unknown weight",
        "Goods 7.5 tonnes mgw and over",
    ],
    "Car": [
        "Taxi/Private hire car",
    ],
    "Bus": [
        "Minibus (8 - 16 passenger seats)",
        "Bus or coach (17 or more pass seats)",
    ],
    "Other Vehicle": [
        "Agricultural vehicle",
        "Other vehicle",
    ],
}
vehicle_relabel = {
    detailed: group
    for group, detailed_names in vehicle_groups.items()
    for detailed in detailed_names
}
df['vehicle_type'] = df['vehicle_type'].replace(vehicle_relabel)
# Share of all rows (in percent) per condensed vehicle type.
df['vehicle_type'].value_counts().div(len(df)).mul(100)
Create more condensed groups for the age band of the driver in order to deal with some potential outliers.
#age_band_of_driver
df['age_band_of_driver'].value_counts()/df.shape[0]*100
# Collapse the published age bands into five wider groups with one dictionary
# lookup. The earlier version looped over every row and re-ran a whole-column
# replace on each iteration (quadratic work for an unchanged result), and it
# lower-cased the value before the membership test, which is why "Over 75"
# never matched and needed a separate rename.
age1 = ["0 - 5", "6 - 10", "11 - 15"]
age2 = ["16 - 20", "21 - 25"]
age3 = ["26 - 35", "36 - 45"]
age4 = ["46 - 55", "56 - 65"]
# "75-100" is kept as an accepted spelling so data that already went through
# the old rename is still grouped correctly.
age5 = ["66 - 75", "Over 75", "75-100"]
age_band_groups = {
    'Under 16': age1,
    '16-25': age2,
    '26-45': age3,
    '46-65': age4,
    'Over 65': age5,
}
age_band_relabel = {
    band: group
    for group, bands in age_band_groups.items()
    for band in bands
}
df['age_band_of_driver'] = df['age_band_of_driver'].replace(age_band_relabel)
#age_band_of_driver
print("Distinct responses for age_band_of_driver:\n", set(df['age_band_of_driver']))
# Share of all rows (in percent) per raw number_of_vehicles value.
df['number_of_vehicles'].value_counts().div(len(df)).mul(100)
def vehicles(num_veh):
    """Map a vehicle count to its label: "1", "2", "3", or "4+" from four upward.

    Returns None when the count is below 1 or is not comparable (e.g. NaN).
    """
    if num_veh >= 4:
        return "4+"
    if num_veh >= 3:
        return "3"
    if num_veh >= 2:
        return "2"
    if num_veh >= 1:
        return "1"
    return None
# Replace the raw counts with their group labels.
df['number_of_vehicles'] = df['number_of_vehicles'].map(vehicles)
# Share of all rows (in percent) per vehicle group, then the column dtype.
df['number_of_vehicles'].value_counts().div(len(df)).mul(100)
df['number_of_vehicles'].dtypes
df['number_of_vehicles'] = df['number_of_vehicles'].astype('object')
#creating seasons column for ML
def getSeason(month):
    """Return the season name for a month number (1-12).

    December-February is "winter", March-May "spring", June-August "summer";
    every other value yields "fall".
    """
    if month in (12, 1, 2):
        return "winter"
    if month in (3, 4, 5):
        return "spring"
    if month in (6, 7, 8):
        return "summer"
    return "fall"
# Derive the season from the month.
df['season'] = df['month'].map(getSeason)
# Share of all rows (in percent) per season.
df['season'].value_counts().div(len(df)).mul(100)
# Back to engine capacity: look at its distribution before grouping it.
df['engine_capacity_cc'].hist()
def enginecap(eng_cc):
    """Classify an engine capacity (cc) as small, medium or large.

    Up to 1500 is small, above 1500 up to 2000 is medium, above 2000 is large.
    Returns None when the value is not comparable (e.g. NaN).
    """
    if eng_cc <= 1500:
        return "small engine cc"
    if eng_cc <= 2000:
        return "medium engine cc"
    if eng_cc > 2000:
        return "large engine cc"
    return None
# Size class for every engine capacity.
df['engine_capacity_cc_size'] = df['engine_capacity_cc'].map(enginecap)
df.engine_capacity_cc_size.value_counts()
#Put above pickle in next full run
# Two-class target for machine learning and visualisation: "Slight" becomes
# "Not Serious", "Fatal" is folded into "Serious", "Serious" stays as it is.
seriousness_relabel = {"Slight": "Not Serious", "Fatal": "Serious"}
df['accident_seriousness'] = df['accident_severity'].replace(seriousness_relabel)
df.shape
df.accident_seriousness.value_counts()
# Save the cleaned frame so a restart can pick up from here, then load it back.
pickle_path = "df.pkl"
df.to_pickle(pickle_path)
df = pd.read_pickle(pickle_path)
df.head()
# Bar chart of the number of accident records per year, saved as
# accidentsperyear.png.
accidentsperyear = df.groupby(['year'])['accident_index'].count()
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(10,5))
# One colour per plotted year instead of a fixed count that may be too short.
colors = sns.color_palette("rainbow", n_colors=len(accidentsperyear))
# x and y are passed by keyword; newer seaborn releases reject positional data
# vectors.
sns.barplot(x=accidentsperyear.index, y=accidentsperyear.values, palette=colors)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.title("Accidents Per Year",fontsize=20,fontweight="bold")
plt.xlabel("\nYear", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
# Lay the figure out before saving so the file gets the adjusted layout.
plt.tight_layout()
plt.savefig('accidentsperyear.png')
# Bar chart of the number of accident records per calendar month, saved as
# accidentspermonth.png.
accidentspermonth = df.groupby(['month'])['accident_index'].count()
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(20,10))
colors = sns.color_palette("rainbow", n_colors=12)
# x and y are passed by keyword; newer seaborn releases reject positional data
# vectors.
mt = sns.barplot(x=accidentspermonth.index, y=accidentspermonth.values, palette=colors)
sns.despine(top=True, right=True, left=True, bottom=True)
# mt is the axes instance; relabel the month numbers with short names
group_labels = ['Jan', 'Feb','Mar','Apr','May','June','July','Aug','Sept','Oct','Nov','Dec' ]
mt.set_xticklabels(group_labels)
plt.title("Accidents Per Month",fontsize=20,fontweight="bold")
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
plt.xlabel("\nMonth", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
# Lay the figure out before saving so the file gets the adjusted layout.
plt.tight_layout()
plt.savefig('accidentspermonth.png')
# Heatmap of accident counts with one row per year and one column per weekday,
# saved as accidentsbyweekdayperyear.png.
weekdays = ['Sunday', 'Saturday', 'Friday', 'Thursday', 'Wednesday', 'Tuesday', 'Monday']
accweekday = df.groupby(['year', 'day_of_week']).size()
accweekday = (
    accweekday.rename_axis(['year', 'day_of_week'])
    .unstack('day_of_week')
    .reindex(columns=weekdays)
)
plt.figure(figsize=(15, 10))
plt.style.use('dark_background')
sns.heatmap(accweekday, cmap='rainbow')
plt.title('\nAccidents by Weekday per Year\n', fontsize=14, fontweight='bold')
plt.xticks(fontsize=15)
plt.yticks(fontsize=12)
# No axis labels: the tick labels already name the weekday and the year.
plt.xlabel('')
plt.ylabel('')
plt.savefig('accidentsbyweekdayperyear.png')
plt.show()
Fridays are the day of the week where the most accidents occur.
# Bar chart of the number of accident records per season, saved as
# accidentsperseason.png.
accidentsperseason = df.groupby(['season'])['accident_index'].count()
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(15,10))
colors = ["orangered", "deepskyblue","gold", "indigo"]
# x and y are passed by keyword; newer seaborn releases reject positional data
# vectors.
sns.barplot(x=accidentsperseason.index, y=accidentsperseason.values, palette=colors)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.title("Accidents Per Season",fontsize=20,fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=12)
plt.xlabel("\nSeason", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.tight_layout()
plt.savefig('accidentsperseason.png')
print(df.columns)
# #change objects to category
# df[df.select_dtypes(['object']).columns] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))
#correlation by accident severity
# Every column is label-encoded to integers so that a correlation with the
# encoded accident_seriousness column can be computed for all of them.
corrdf = df.apply(LabelEncoder().fit_transform)
corr = corrdf.corr()['accident_seriousness']
# Rank from most positive to most negative. sort_values orders by value
# directly, instead of feeding integer positions from argsort through `[]`
# on a label-indexed Series (deprecated, and unreliable when NaN is present).
corr.sort_values(ascending=False)
# Subset holding only the "Not Serious" accidents. .copy() makes it an
# independent frame, so columns can be assigned on it later without pandas
# treating it as a slice of df.
not_serious = df[df['accident_seriousness'] == "Not Serious"].copy()
print("Not Serious Group Shape:", not_serious.shape)
not_serious.accident_seriousness.value_counts()
# Subset holding only the "Serious" accidents, copied for the same reason.
serious = df[df['accident_seriousness'] == "Serious"].copy()
print("Serious Group Shape:", serious.shape)
serious.accident_seriousness.value_counts()
Looking at this I wanted to visualize some of the higher pos/negative correlations against accident severity.
For my visualizations I have decided to use some of the features with the highest correlations to accident_seriousness:
Note: The columns used were selected because of the absolute value of their correlation with accident_seriousness.
*columns added after correlation was done after undersampling
# Code-to-label lookups used to make the plots readable.
policeattend = {1: "Yes", 2: "No", 3: "Self-Reported"}
imddecile = {
    1: "Most deprived 10%",
    2: "More deprived 10-20%",
    3: "More deprived 20-30%",
    4: "More deprived 30-40%",
    5: "More deprived 40-50%",
    6: "Less deprived 40-50%",
    7: "Less deprived 30-40%",
    8: "Less deprived 20-30%",
    9: "Less deprived 10-20%",
    10: "Least deprived 10%",
}
# Apply both lookups to the full frame and to each severity subset.
column_lookups = (
    ('did_police_officer_attend_scene_of_accident', policeattend),
    ('driver_imd_decile', imddecile),
)
for coded_col, lookup in column_lookups:
    for target_frame in (not_serious, df, serious):
        target_frame[coded_col] = target_frame[coded_col].map(lookup)
#setups for adding frequencies to visualizations
dftotal, nstotal, setotal = (float(len(counted)) for counted in (df, not_serious, serious))
#Did Police Officer Attend Scene Of Accident
# One chart per severity group; each bar is annotated with its share of that
# group. The two charts differ only in data, total, title and file name, so
# they are drawn by one loop.
plt.style.use('dark_background')
for subset, subset_total, severity_label, file_tag in (
        (not_serious, nstotal, "Not Serious", "not_serious"),
        (serious, setotal, "Serious", "serious")):
    plt.figure(figsize=(15,10))
    # x is passed by keyword; newer seaborn releases reject it positionally.
    ax = sns.countplot(x="did_police_officer_attend_scene_of_accident",
                       hue="accident_seriousness", palette="PuBu", data=subset)
    plt.title("Did Police Officer Attend Scene Of {} Accident".format(severity_label),
              fontsize=20, fontweight="bold")
    plt.xlabel("\nAttendance", fontsize=15, fontweight="bold")
    plt.legend(fontsize=15, bbox_to_anchor=(1.0, 1), loc='upper right', ncol=1)
    plt.ylabel("\nNumber Attended", fontsize=15, fontweight="bold")
    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred just above the bar.
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.3f}%'.format(height/subset_total*100),
                ha="center", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.savefig('did_police_officer_attend_scene_of_accident_{}.png'.format(file_tag))
    plt.show()
# First Point of Impact Vs Accident Seriousness
# One chart per severity group; each bar is annotated with its share of that
# group. The two charts differ only in data, total, title and file name, so
# they are drawn by one loop.
fpoa_order =["Front", "Nearside", "Did not impact", "Back", "Offside"]
plt.style.use('dark_background')
for subset, subset_total, severity_label, file_tag in (
        (not_serious, nstotal, "Not Serious", "not_serious"),
        (serious, setotal, "Serious", "serious")):
    plt.figure(figsize=(20,10))
    # x is passed by keyword; newer seaborn releases reject it positionally.
    ax = sns.countplot(x="x1st_point_of_impact", hue="accident_seriousness",
                       order=fpoa_order, palette="PuBu", data=subset)
    plt.title("First Point of Impact in {} Accidents".format(severity_label),
              fontsize=20, fontweight="bold")
    plt.xlabel("\nPoint of Impact", fontsize=15, fontweight="bold")
    plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
    plt.ylabel("\nFirst Point of Impact Count", fontsize=15, fontweight="bold")
    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred just above the bar.
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/subset_total*100),
                ha="center", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.savefig('x1st_point_of_impact_{}.png'.format(file_tag))
    plt.show()
#number of vehicles vs accidentseriousness
# One chart per severity group with one bar per vehicle-count group; each bar
# is annotated with its share of that severity group. The two charts differ
# only in data, total, title and file name, so they are drawn by one loop.
nov_order=["1","2", "3", "4+"]
plt.style.use('dark_background')
for subset, subset_total, severity_label, file_tag in (
        (not_serious, nstotal, "Not Serious", "not_serious"),
        (serious, setotal, "Serious", "serious")):
    plt.figure(figsize=(20,10))
    # x is passed by keyword; newer seaborn releases reject it positionally.
    ax = sns.countplot(x="accident_seriousness", hue="number_of_vehicles",
                       hue_order=nov_order, palette="GnBu_d", data=subset)
    plt.title("Number of Vehicles in {} Accidents".format(severity_label),
              fontsize=20, fontweight="bold")
    plt.xlabel("\nNumber of Vehicles", fontsize=15, fontweight="bold")
    # A fresh legend carries no title, so no separate title reset is needed.
    plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
    plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred just above the bar.
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/subset_total*100),
                ha="center", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    # The single x category adds nothing, so hide its ticks and label.
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.savefig('number_of_vehicles_{}.png'.format(file_tag))
    plt.show()
# Speed limit vs accident seriousness: one chart per severity group, each bar
# annotated with its share of that group.
splt_order=[15.0, 20.0,30.0,40.0 ,50.0,60.0, 70.0]
#splt1_order=[20.0,30.0,40.0 ,50.0,60.0, 70.0]
plt.style.use('dark_background')
# As before, only the "Not Serious" chart uses a fixed category order and
# four decimals; the "Serious" chart lets seaborn pick the order (order=None)
# and uses three decimals.
for subset, subset_total, severity_label, file_tag, bar_order, pct_format in (
        (not_serious, nstotal, "Not Serious", "not_serious", splt_order, '{:1.4f}%'),
        (serious, setotal, "Serious", "serious", None, '{:1.3f}%')):
    plt.figure(figsize=(20,10))
    # x is passed by keyword; newer seaborn releases reject it positionally.
    ax = sns.countplot(x="speed_limit", hue="accident_seriousness",
                       order=bar_order, palette="PuBu", data=subset)
    plt.title("Speed Limit vs {} Accidents".format(severity_label),
              fontsize=20, fontweight="bold")
    plt.xlabel("\nSpeed Limits", fontsize=15, fontweight="bold")
    plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
    plt.ylabel("\nCount", fontsize=15, fontweight="bold")
    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred just above the bar.
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                pct_format.format(height/subset_total*100),
                ha="center", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.savefig('speed_limit_{}.png'.format(file_tag))
    plt.show()
#urban_or_rural_area vs accident seriousness
# One chart per severity group with one bar per area type; each bar is
# annotated with its share of that severity group. The two charts differ only
# in data, total and file name, so they are drawn by one loop.
plt.style.use('dark_background')
for subset, subset_total, file_tag in (
        (not_serious, nstotal, "not_serious"),
        (serious, setotal, "serious")):
    plt.figure(figsize=(20,10))
    # x is passed by keyword; newer seaborn releases reject it positionally.
    ax = sns.countplot(x="accident_seriousness", hue="urban_or_rural_area",
                       palette="PuBu", data=subset)
    plt.title("Urban or Rural Area vs Accident Severity",fontsize=20,fontweight="bold")
    plt.xlabel("\nSeverity", fontsize=15, fontweight="bold")
    plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
    plt.ylabel("\nUrban or Rural Area Count", fontsize=15, fontweight="bold")
    for p in ax.patches:
        height = p.get_height()
        # Percentage label centred just above the bar.
        ax.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/subset_total*100),
                ha="center", fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.savefig('urban_or_rural_area_{}.png'.format(file_tag))
    plt.show()
#Not Serious Accident
sao_order=["None", "Skidded", "Skidded and overturned", "Overturned", "Jackknifed",
"Jackknifed and overturned"]
plt.figure(figsize=(15,10))
ax = sns.countplot("accident_seriousness", hue="skidding_and_overturning", hue_order=sao_order,
palette="magma", data=not_serious)
plt.style.use('dark_background')
plt.title("Skidding and Overturning in Not Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Skidding and Overturning", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.3f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('skidding_and_overturning_not_serious.png')
plt.show()
#Serious Accident Manuevers
plt.figure(figsize=(15,10))
ax= sns.countplot("accident_seriousness", hue="skidding_and_overturning", hue_order=sao_order,
palette="magma", data=serious)
plt.style.use('dark_background')
plt.title("Skidding and Overturning in Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Skidding and Overturning", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.3f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('skidding_and_overturning_serious.png')
plt.show()
#Not Serious Accident Manuevers
vlc_order=["Did not leave carriageway", "Straight ahead at junction", "Nearside",
"Offside", "Offside on to central reservation", "Nearside and rebounded",
"Offside - crossed central reservation", "Offside and rebounded",
"Offside on to centrl res + rebounded"]
plt.figure(figsize=(15,10))
ax=sns.countplot("accident_seriousness", hue="vehicle_leaving_carriageway", hue_order=vlc_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Vehicle Leaving Carriageway in Not Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Vehicle Leaving Carriageway ", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents\n", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.3f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('vehicle_leaving_carriageway_not_serious.png')
plt.show()
#Serious Accident Manuevers
plt.figure(figsize=(15,10))
ax=sns.countplot("accident_seriousness", hue="vehicle_leaving_carriageway", hue_order=vlc_order,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Vehicle Leaving Carriageway in Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Vehicle Leaving Carriageway ", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents\n", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.3f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('vehicle_leaving_carriageway_serious.png')
plt.show()
#sex_of_driver
sod_order=["Female", "Male", "Not known"]
plt.figure(figsize=(15,10))
ax=sns.countplot("accident_seriousness", hue="sex_of_driver", hue_order=sod_order,
palette="magma", data=not_serious)
plt.style.use('dark_background')
plt.title("Sex of Driver in Not Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSex of Driver", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('sex_of_driver_not_serious.png')
plt.show()
#sex_of_driver serious
plt.figure(figsize=(15,10))
ax=sns.countplot("accident_seriousness", hue="sex_of_driver", hue_order=sod_order,
palette="magma", data=serious)
plt.style.use('dark_background')
plt.title("Sex of Driver in Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSex of Driver", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('sex_of_driver_serious.png')
plt.show()
#sex_of_driver
df['sex_of_driver'].value_counts()/df.shape[0]*100
#Not Serious Accident Type
vt_order=['Bus', 'Car', 'Goods Vehicle', 'Motorcycle', 'Other Vehicle']
plt.figure(figsize=(15,10))
ax=sns.countplot("accident_seriousness", hue="vehicle_type", hue_order=vt_order,
palette="tab20", data=not_serious)
plt.style.use('dark_background')
plt.title("Vehicle Type in Not Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accidents by Vehicle Type", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('vehicle_type_not_serious.png')
plt.show()
#Serious Accident Type
plt.figure(figsize=(15,10))
ax=sns.countplot("accident_seriousness", hue="vehicle_type", hue_order=vt_order,
palette="tab20", data=serious)
plt.style.use('dark_background')
plt.title("Vehicle Type in Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accidents by Vehicle Type", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('vehicle_type_serious.png')
plt.show()
#Not Serious Accident Manuevers
vm_order=['Turning right', 'Going ahead other', 'Going ahead right-hand bend',
'Slowing or stopping', 'Turning left', 'Waiting to go - held up',
'Waiting to turn right', 'Overtaking static vehicle - offside' ,
'Parked', 'Overtaking - nearside', 'U-turn', 'Changing lane to right',
'Reversing', 'Waiting to turn left', 'Changing lane to left',
'Going ahead left-hand bend', 'Overtaking moving vehicle - offside', 'Moving off']
plt.figure(figsize=(20,10))
ax=sns.countplot("accident_seriousness", hue="vehicle_manoeuvre", hue_order=vm_order,
palette="tab20", data=not_serious)
plt.style.use('dark_background')
plt.title("Vehicle Manuevers in Not Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Vehicle Manuevers", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('vehicle_manoeuvre_not_serious.png')
plt.show()
#Serious Accident Manuevers
plt.figure(figsize=(20,10))
ax=sns.countplot("accident_seriousness", hue="vehicle_manoeuvre",hue_order=vm_order,
palette="tab20", data=serious)
plt.style.use('dark_background')
plt.title("Vehicle Manuevers in Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Vehicle Manuevers", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('vehicle_manoeuvre_serious.png')
plt.show()
#driver_home_area_type
dhoa_order=['Urban area', 'Rural', 'Small town']
#Serious Accident Driver Home Type Area
plt.figure(figsize=(20,15))
ax= sns.countplot("accident_seriousness", hue="driver_home_area_type", hue_order=dhoa_order,
palette="rainbow", data=not_serious)
plt.style.use('dark_background')
plt.title("Accident Driver Home Type Area in Not Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSeriousness", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
#plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('driver_home_area_type_not_serious.png')
plt.show()
#driver_home_area_type
#Serious Accident Driver Home Type Area
plt.figure(figsize=(20,15))
ax= sns.countplot("accident_seriousness", hue="driver_home_area_type", hue_order=dhoa_order,
palette="rainbow", data=serious)
plt.style.use('dark_background')
plt.title("Accident Driver Home Type Area in Serious Accidents",fontsize=25,fontweight="bold")
plt.xlabel("\nSeriousness", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
#plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('driver_home_area_type_serious.png')
plt.show()
#age_band_of_driver
abod_order=['Under 16', '16-25', '26-45', '46-65','Over 65']
#Not Serious Accident age_band_of_driver
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="age_band_of_driver", hue_order=abod_order,
palette="magma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Age Band of Driver",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Age Band of Driver", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
#plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('age_band_of_driver_not_serious.png')
plt.show()
#Serious Accident age_band_of_driver
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="age_band_of_driver", hue_order=abod_order,
palette="magma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Age Band of Driver",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Age Band of Driver", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
#plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('age_band_of_driver_serious.png')
plt.show()
#junction_control
jc_order = ['Give way or uncontrolled', 'Auto traffic signal', 'Authorised person',
'Stop sign','Not at junction or within 20 metres']
#Not Serious Accident junction_control
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="junction_control", hue_order=jc_order,
palette="magma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Junction Control",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Junction Control", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('junction_control_not_serious.png')
plt.show()
#Serious Accident junction_control
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="junction_control",hue_order=jc_order,
palette="magma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Junction Control",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Junction Control", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('junction_control_serious.png')
plt.show()
#hit_object_off_carriageway
hooffc_order=['None', 'Lamp post', 'Road sign or traffic signal', 'Other permanent object',
'Entered ditch', 'Tree', 'Near/Offside crash barrier','Central crash barrier',
'Bus stop or bus shelter', 'Telegraph or electricity pole', 'Submerged in water',
'Wall or fence']
#Not Serious Accident hit_object_off_carriageway
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="hit_object_off_carriageway", hue_order=hooffc_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Hit Object Off Carriageway",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Hit Object Off Carriageway", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('hit_object_off_carriageway_not_serious.png')
plt.show()
#Serious Accident hit_object_off_carriageway
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="hit_object_off_carriageway", hue_order=hooffc_order,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Hit Object Off Carriageway",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Hit Object Off Carriageway", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('hit_object_off_carriageway_serious.png')
plt.show()
#hit_object_in_carriageway
hoinc_order=['None', 'Kerb', 'Other object', 'Bollard or refuge', 'Parked vehicle',
'Road works', 'Open door of vehicle', 'Central island of roundabout',
'Previous accident', 'Bridge (side)', 'Any animal (except ridden horse)',
'Bridge (roof)']
#Not Serious Accident hit_object_in_carriageway
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="hit_object_in_carriageway", hue_order=hoinc_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Hit Object in Carriageway",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Hit Object in Carriageway", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('hit_object_in_carriageway_not_serious.png')
plt.show()
#Serious Accident hit_object_in_carriageway
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="hit_object_in_carriageway", hue_order=hoinc_order,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Hit Object in Carriageway",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Hit Object in Carriageway", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('hit_object_in_carriageway_serious.png')
plt.show()
#driver_imd_decile
imd_order=["Least deprived 10%", "Less deprived 10-20%", "Less deprived 20-30%",
"Less deprived 30-40%","Less deprived 40-50%","Most deprived 10%",
"More deprived 10-20%", "More deprived 20-30%", "More deprived 30-40%",
"More deprived 40-50%"]
#Not Serious Accident driver_imd_decile
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="driver_imd_decile", hue_order=imd_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Driver Area Deprivation Score",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Driver Area Deprivation Score", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('driver_imd_decile_not_serious.png')
plt.show()
#Serious Accident driver_imd_decile
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="driver_imd_decile", hue_order=imd_order,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Driver Area Deprivation Score",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Driver Area Deprivation Score", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('driver_imd_decile_serious.png')
plt.show()
#junction_detail
jud_order=['T or staggered junction', 'Mini-roundabout', 'Crossroads',
'Private drive or entrance', 'More than 4 arms (not roundabout)',
'Roundabout', 'Slip road', 'Other junction','Not at junction or within 20 metres']
#Not Serious Accident junction_detail
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="junction_detail", hue_order=jud_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Junction Detail",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Junction Detail", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('junction_detail_not_serious.png')
plt.show()
#Serious Accident junction_detail
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="junction_detail", hue_order=jud_order,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Junction Detail",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Junction Detail", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('junction_detail_serious.png')
plt.show()
#junction_location
jul_order=['Mid Junction - on roundabout or on main road', 'Entering main road',
'Approaching junction or waiting/parked at junction approach',
'Cleared junction or waiting/parked at junction exit', 'Leaving main road',
'Leaving roundabout', 'Entering roundabout', 'Entering from slip road',
'Not at or within 20 metres of junction']
#Not Serious Accident junction_location
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="junction_location", hue_order=jul_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Junction Location",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Junction Location", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('junction_location_not_serious.png')
plt.show()
#Serious Accident junction_location
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="junction_location", hue_order=jul_order,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Junction Location",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Junction Location", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('junction_location_serious.png')
plt.show()
#propulsion_code
pd_order=['Petrol', 'Heavy oil', 'Hybrid electric', 'Bio-fuel', 'LPG Petrol', 'Diesel',
'Fuel cells', 'New fuel technology', 'Electric diesel']
pd_order2=['Petrol', 'Heavy oil', 'Hybrid electric', 'Bio-fuel', 'LPG Petrol', 'Electric diesel']
#Not Serious Accident propulsion_code
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="propulsion_code", hue_order=pd_order,
palette="plasma", data=not_serious)
plt.style.use('dark_background')
plt.title("Not Serious Accident by Propulsion Code",fontsize=25,fontweight="bold")
plt.xlabel("\nNot Serious Accident by Propulsion Code", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/nstotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('propulsion_code_not_serious.png')
plt.show()
#Serious Accident propulsion_code
plt.figure(figsize=(20,15))
ax=sns.countplot("accident_seriousness", hue="propulsion_code", hue_order=pd_order2,
palette="plasma", data=serious)
plt.style.use('dark_background')
plt.title("Serious Accident by Propulsion Code",fontsize=25,fontweight="bold")
plt.xlabel("\nSerious Accident by Propulsion Code", fontsize=15, fontweight="bold")
plt.legend().set_title('')
plt.legend(fontsize='22', loc = 'upper right')
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
for p in ax.patches:
height = p.get_height()
ax.text(p.get_x()+p.get_width()/2.,
height + 3,
'{:1.2f}%'.format(height/setotal*100),
ha="center",fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.savefig('propulsion_code_serious.png')
plt.show()
#year
year_order=[2010, 2011, 2012, 2013, 2014, 2015, 2016]


def _plot_year_share(frame, total, label, outfile):
    """Draw accident counts per year for one severity subset and save the chart.

    frame   -- DataFrame handed to seaborn as ``data``
    total   -- divisor used for the percentage written above each bar
    label   -- severity wording used in the title and the x-axis label
    outfile -- file name passed to ``plt.savefig``

    Returns the Axes produced by ``sns.countplot``.
    """
    plt.figure(figsize=(20,15))
    # x is passed by keyword: seaborn 0.12+ accepts only `data` positionally.
    axes = sns.countplot(x="accident_seriousness", hue="year", hue_order=year_order,
                         palette="plasma", data=frame)
    plt.style.use('dark_background')
    plt.title("{} Accident by Year".format(label),fontsize=25,fontweight="bold")
    plt.xlabel("\n{} Accident by Year".format(label), fontsize=15, fontweight="bold")
    plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
    for bar in axes.patches:
        height = bar.get_height()
        if not height > 0:
            # NaN or empty placeholder patch: nothing meaningful to label
            continue
        axes.text(bar.get_x()+bar.get_width()/2.,
                  height + 3,
                  '{:1.2f}%'.format(height/total*100),
                  ha="center",fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    # Single legend call: every plt.legend() replaces the previous legend, so
    # only the last of the original three calls shaped the figure.
    plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
    plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.savefig(outfile)
    plt.show()
    return axes


#Not Serious Accident year
ax = _plot_year_share(not_serious, nstotal, "Not Serious", 'year_not_serious.png')
#Serious Accident year
ax = _plot_year_share(serious, setotal, "Serious", 'year_serious.png')
Based on the previous visualizations, I wanted to compare certain variables against each other to look for further relationships. The comparisons listed above are displayed in the outputs that follow.
#Variable-vs-variable comparisons over all accidents (data=df)
def _plot_pair_counts(x_col, hue_col, title, x_label, outfile):
    """Draw a grouped count plot of ``x_col`` split by ``hue_col`` from ``df`` and save it.

    x_col    -- column of ``df`` placed on the x axis
    hue_col  -- column of ``df`` used to colour/group the bars
    title    -- chart title
    x_label  -- x-axis label (the original cells all said "Accident by Year",
                a copy-paste leftover that did not describe these charts)
    outfile  -- file name passed to ``plt.savefig``

    Returns the Axes produced by ``sns.countplot``.
    """
    plt.figure(figsize=(20,15))
    # x is passed by keyword: seaborn 0.12+ accepts only `data` positionally.
    axes = sns.countplot(x=x_col, hue=hue_col, palette="plasma", data=df)
    plt.style.use('dark_background')
    plt.title(title,fontsize=25,fontweight="bold")
    plt.xlabel("\n" + x_label, fontsize=15, fontweight="bold")
    plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    # Single legend call: every plt.legend() replaces the previous legend, so
    # only the last of the original three calls shaped the figure.
    plt.legend(fontsize='15', bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
    sns.despine(top=True, right=True, left=True, bottom=False)
    plt.savefig(outfile)
    plt.show()
    return axes


ax = _plot_pair_counts("junction_control", "junction_detail",
                       "Junction Control by Junction Detail",
                       "Junction Control",
                       'junction_control_by_junction_detail.png')
ax = _plot_pair_counts("junction_control", "junction_location",
                       "Junction Control by Junction Location in Accidents",
                       "Junction Control",
                       'junction_control_by_junction_location.png')
ax = _plot_pair_counts("x1st_point_of_impact", "junction_detail",
                       "First point of Impact by Junction Detail",
                       "First Point of Impact",
                       'x1st_point_of_impact_by_junction_detail.png')
ax = _plot_pair_counts("x1st_point_of_impact", "junction_location",
                       "First point of Impact by Junction Location",
                       "First Point of Impact",
                       'x1st_point_of_impact_by_junction_location.png')
ax = _plot_pair_counts("x1st_point_of_impact", "junction_control",
                       "First point of Impact by Junction Control",
                       "First Point of Impact",
                       'x1st_point_of_impact_by_junction_control.png')
In every comparison above, most accidents occurred in areas that were uncontrolled. One of the main contributors was the junction detail "T or staggered junction".
Other areas of concern include mid-junction locations on roundabouts or main roads, and areas approaching a junction where cars were either parking or waiting in the junction.
Based on the data above, more controlled areas would be beneficial. Signs alerting drivers to upcoming junctions, traffic lights, or stop signs might help in the areas where they are feasible.
#made separate dataframe w. set index that wouldnt effect data vis above
# A real copy is required for that: `df1 = df` only binds a second name to the
# same object, so the in-place set_index below would also re-index `df`.
# NOTE(review): `df` now keeps its original index and its accident_index
# column; any later cell that expected `df` to be indexed by accident_index
# should be checked.
df1 = df.copy()
#set index to accident_index
df1 = df1.set_index('accident_index')
df1.head()
# Integer-encode every column; the encoder is re-fitted on each column in turn
df1 = df1.apply(LabelEncoder().fit_transform)
Undersampling is used because the classes are extremely imbalanced, which would otherwise bias the models toward the majority class.
#First set up of X and Y
X = df1.drop(['accident_severity','accident_seriousness'],axis=1)
y = df1['accident_seriousness']
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
# concatenate our training data back together so rows and labels are resampled jointly
# (only the training split is rebalanced; the test split keeps its original class mix)
X = pd.concat([X_train, y_train], axis=1)
# separate minority and majority classes
# assumes the label encoding mapped the majority class to 0 and the minority to 1 - TODO confirm
not_severe = X[X.accident_seriousness==0]
severe = X[X.accident_seriousness==1]
# decrease majority
# Undersampling draws WITHOUT replacement so that no majority row is duplicated;
# the earlier replace=True was carried over from an upsampling recipe.
not_severe_decreased = resample(not_severe,
                                replace=False,           # sample without replacement
                                n_samples=len(severe),   # match number in minority class
                                random_state=27)         # reproducible results
# combine minority with the reduced majority
newdf = pd.concat([severe, not_severe_decreased])
newdf.accident_seriousness.value_counts()
X_train = newdf.drop('accident_seriousness', axis=1)
y_train = newdf.accident_seriousness
#scaling
# The scaler is fitted on the training data only and then applied to the test data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#Decision Tree Classifier
# Fit on the rebalanced training data, then score on the held-out test split
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
pred_dtc = dtc.predict(X_test)
#Check accuracy
print(f"Decision Tree Classifier Accuracy Score: {accuracy_score(y_test, pred_dtc) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Decision Tree Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_dtc, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(dtc, X_train, y_train, cv=5)
print(f"Decision Tree Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Decision Tree Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_dtc))
#Bagging Classifier
# Fit on the rebalanced training data, then score on the held-out test split
bagc = BaggingClassifier(random_state=42)
bagc.fit(X_train, y_train)
pred_bagc = bagc.predict(X_test)
#Check accuracy
print(f"Bagging Classifier Accuracy Score: {accuracy_score(y_test, pred_bagc) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Bagging Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_bagc, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(bagc, X_train, y_train, cv=5)
print(f"Bagging Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Bagging Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_bagc))
#ExtraTreesClassifier
# Fit on the rebalanced training data, then score on the held-out test split
extc = ExtraTreesClassifier(random_state=42)
extc.fit(X_train, y_train)
pred_extc = extc.predict(X_test)
#Check accuracy
print(f"Extra Trees Classifier Accuracy Score: {accuracy_score(y_test, pred_extc) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Extra Trees Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_extc, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(extc, X_train, y_train, cv=5)
print(f"Extra Trees Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Extra Trees Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_extc))
#AdaBoost Classifier
# Fit on the rebalanced training data, then score on the held-out test split
adbc = AdaBoostClassifier(random_state=42)
adbc.fit(X_train, y_train)
pred_adbc = adbc.predict(X_test)
#Check accuracy
print(f"AdaBoost Classifier Accuracy Score: {accuracy_score(y_test, pred_adbc) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"AdaBoost Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_adbc, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(adbc, X_train, y_train, cv=5)
print(f"AdaBoost Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("AdaBoost Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_adbc))
#Random Forest Classifier
# Fit on the rebalanced training data, then score on the held-out test split
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
#Check accuracy
print(f"Random Forest Classifier Accuracy Score: {accuracy_score(y_test, pred_rfc) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Random Forest Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_rfc, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(rfc, X_train, y_train, cv=5)
print(f"Random Forest Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Random Forest Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_rfc))
#Gradient Boosting Classifier
# Fit on the rebalanced training data, then score on the held-out test split
gbc = ensemble.GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
pred_gbc = gbc.predict(X_test)
#Check accuracy
print(f"Gradient Boosting Classifier Accuracy Score: {accuracy_score(y_test, pred_gbc) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Gradient Boosting Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_gbc, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(gbc, X_train, y_train, cv=5)
print(f"Gradient Boosting Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Gradient Boosting Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_gbc))
#Light GBM
# Fit on the rebalanced training data, then score on the held-out test split
lgbm = lgb.LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)
pred_lgbm = lgbm.predict(X_test)
#check accuracy
print(f"LightGBM Classifier Accuracy Score: {accuracy_score(y_test, pred_lgbm) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"LightGBM Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_lgbm, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(lgbm, X_train, y_train, cv=5)
print(f"LightGBM Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("LightGBM Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_lgbm))
#XGBoost
# Fit on the rebalanced training data, then score on the held-out test split
xgb = XGBClassifier(n_estimators=100, random_state=42, max_depth=10)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
#check accuracy
print(f"XGBoost Classifier Accuracy Score: {accuracy_score(y_test, pred_xgb) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"XGBoost Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_xgb, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(xgb, X_train, y_train, cv=5)
print(f"XGBoost Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("XGBoost Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_xgb))
# #RANDOM FOREST PARAM
# rfc_param = {
# 'n_estimators': [100, 200, 300, 500],
# 'criterion': ['entropy', 'gini'],
# 'max_features':['auto','sqrt'],
# 'max_depth': [10, 50, 100],
# 'min_samples_split': [2, 5, 10],
# 'min_samples_leaf': [1, 2, 4, 10],
# 'random_state':[42]}
# grid_rfc = GridSearchCV(rfc, param_grid = rfc_param, cv = 3, verbose = 1, n_jobs=-1)
# grid_rfc.fit(X_train,y_train)
# print(rfcbest_estimator = grid_rfc.best_estimator_)
# print("Random Forest:\n",grid_rfc.best_params_)
# #Gradient Boosting Classifier Tuning
# gbcparam= {'learning_rate':[0.5,0.1,1],
# 'n_estimators': [100, 200, 300, 500],
# 'max_features':['auto','sqrt'],
# 'max_depth': [10, 50, 100],
# 'min_samples_leaf': [1, 2, 4, 10],
# 'min_samples_split': [2, 5, 10],
# 'random_state':[42]}
# gbctuning =GridSearchCV(gbc, param_grid = gbcparam, cv = 3, verbose = 1, n_jobs=-1)
# gbctuning.fit(X_train,y_train)
# print("Gradient Boost:\n",gbctuning.best_params_)
#LightGBM Tuning
# Candidate values for the exhaustive grid search over the LightGBM model
lgbmparam = dict(
    learning_rate=[0.5, 0.1, 1],
    n_estimators=[100, 200, 300, 500],
    max_depth=[6, 25, 50, 100],
    num_leaves=[6, 12, 50],
    min_data_in_leaf=[100, 500, 1000],
    random_state=[42],
)
# 3-fold cross-validated search, run in a single job with progress output
lgbmtuning = GridSearchCV(estimator=lgbm, param_grid=lgbmparam,
                          cv=3, n_jobs=1, verbose=1)
lgbmtuning.fit(X_train, y_train)
print("LightGBM:\n", lgbmtuning.best_params_)
#XGBoost Tuning
# Candidate tree depths for XGBoost; nothing in this section runs a search with it
xgbparam = dict(max_depth=[10, 50, 100])
# Random forest refit with hand-set hyper-parameters
rfc2 = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=100,
    max_features='sqrt',
    min_samples_split=8,
    random_state=42,
)
rfc2.fit(X_train, y_train)
pred_rfc2 = rfc2.predict(X_test)
#Check accuracy
print(f"Random Forest Classifier Accuracy Score: {accuracy_score(y_test, pred_rfc2) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Random Forest Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_rfc2, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(rfc2, X_train, y_train, cv=5)
print(f"Random Forest Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Random Forest Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_rfc2))
#Gradient Boosting Classifier
# Gradient boosting refit with hand-set hyper-parameters
gbc2 = ensemble.GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=1,
    random_state=42,
)
gbc2.fit(X_train, y_train)
pred_gbc2 = gbc2.predict(X_test)
#Check accuracy
print(f"Gradient Boosting Classifier Accuracy Score: {accuracy_score(y_test, pred_gbc2) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"Gradient Boosting Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_gbc2, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(gbc2, X_train, y_train, cv=5)
print(f"Gradient Boosting Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("Gradient Boosting Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_gbc2))
#Light GBM
# Grid-search result noted by the author:
#   {'learning_rate': 0.1, 'max_depth': 25, 'min_data_in_leaf': 100,
#    'n_estimators': 500, 'num_leaves': 50, 'random_state': 42}
# The settings used below differ from that result in learning_rate,
# max_depth and min_data_in_leaf.
lgbm2 = lgb.LGBMClassifier(
    learning_rate=0.03,
    max_depth=40,
    min_data_in_leaf=10,
    max_cat_threshold=99999999,
    n_estimators=500,
    num_leaves=50,
    random_state=42,
)
lgbm2.fit(X_train, y_train)
pred_lgbm2 = lgbm2.predict(X_test)
#check accuracy
print(f"LightGBM Classifier Accuracy Score: {accuracy_score(y_test, pred_lgbm2) * 100:0.2f}%")
# F1, precision and recall are macro-averaged
for _metric, _scorer in (("F1", f1_score), ("Precision", precision_score), ("Recall", recall_score)):
    print(f"LightGBM Classifier {_metric} Score: "
          f"{_scorer(y_test, pred_lgbm2, average='macro') * 100:0.2f}%")
# Mean of the 5-fold cross-validation scores on the training data
_cv_scores = cross_val_score(lgbm2, X_train, y_train, cv=5)
print(f"LightGBM Classifier Cross Validation Score: {np.mean(_cv_scores * 100):0.2f}%")
print('\n')
print("LightGBM Classifier Confusion Matrix:\n", confusion_matrix(y_test, pred_lgbm2))
# `dftime` is another name for `df` (no copy is made), so the two
# assignments below also change `df`.
dftime = df
# Keep the current index values as a column, then index the rows by date
dftime['accident_index'] = dftime.index
dftime.index = dftime['date']
# Number of accidents per month ('M' resampling frequency)
monthly_count = dftime['accident_index'].resample('M').count()
# 12-month rolling mean and standard deviation of the monthly counts
_rolling_12 = monthly_count.rolling(window=12)
mov_mean = _rolling_12.mean()
mov_std = _rolling_12.std()
plt.figure(figsize=(20, 10))
main = plt.plot(monthly_count, label='Regular', color='purple')
rmean = plt.plot(mov_mean, label='Rolling Mean', color='red')
rstd = plt.plot(mov_std, label='Rolling Std', color='yellow')
plt.legend(loc='best', fontsize='15')
plt.title('Rolling Mean & Standard Deviation')
plt.show()
#stationary check
# Make subsequent figures 20x10, then plot the additive seasonal
# decomposition of the monthly accident counts
rcParams.update({'figure.figsize': (20, 10)})
decomposition = sm.tsa.seasonal_decompose(monthly_count, model='additive')
fig = decomposition.plot()
plt.show()
#AR Model
# ARIMA(2,1,0) fitted on the monthly accident counts.
# The original cell plotted and scored against `datasetLogDiffShifting['#Passengers']`,
# a leftover from an unrelated air-passengers example; the reference series
# must come from this dataset instead.
# With one differencing step (d=1) the fitted values are presumably on the
# differenced scale (true for the legacy statsmodels ARIMA whose fit() accepts
# `disp` - TODO confirm), so they are compared with the first difference of
# the monthly counts.
monthly_diff = monthly_count.diff().dropna()
model = ARIMA(monthly_count, order=(2,1,0))
results_AR = model.fit(disp=-1)
plt.plot(monthly_diff)
plt.plot(results_AR.fittedvalues, color='red')
# Residual sum of squares between fitted and observed differences
plt.title('RSS: %.4f' % ((results_AR.fittedvalues - monthly_diff)**2).sum())
print('Plotting AR model')